1 Effect of UPSTM-Based Decorrelation on Feature Discovery

1.0.1 Loading the libraries

library("FRESA.CAD")
library(readxl)
library(igraph)
library(umap)
library(tsne)
library(entropy)

op <- par(no.readonly = TRUE)
pander::panderOptions('digits', 3)
pander::panderOptions('table.split.table', 400)
pander::panderOptions('keep.trailing.zeros',TRUE)

1.1 Material and Methods

Data from the speech features

1.2 The Data


pd_speech_features <- as.data.frame(read_excel("~/GitHub/FCA/Data/pd_speech_features.xlsx",sheet = "pd_speech_features", range = "A2:ACB758"))

1.2.1 The Average of the Three Repetitions

Each subject had three repeated observations. Here I’ll use the average of the three experiments per subject.

rep1Parkison <- subset(pd_speech_features,RID==1)
rownames(rep1Parkison) <- rep1Parkison$id
rep1Parkison$id <- NULL
rep1Parkison$RID <- NULL
rep1Parkison[,1:ncol(rep1Parkison)] <- sapply(rep1Parkison,as.numeric)

rep2Parkison <- subset(pd_speech_features,RID==2)
rownames(rep2Parkison) <- rep2Parkison$id
rep2Parkison$id <- NULL
rep2Parkison$RID <- NULL
rep2Parkison[,1:ncol(rep2Parkison)] <- sapply(rep2Parkison,as.numeric)

rep3Parkison <- subset(pd_speech_features,RID==3)
rownames(rep3Parkison) <- rep3Parkison$id
rep3Parkison$id <- NULL
rep3Parkison$RID <- NULL
rep3Parkison[,1:ncol(rep3Parkison)] <- sapply(rep3Parkison,as.numeric)

whof <- !(colnames(rep1Parkison) %in% c("gender","class"));
avgParkison <- rep1Parkison;
avgParkison[,whof] <- (rep1Parkison[,whof] + rep2Parkison[,whof] + rep3Parkison[,whof])/3


signedlog <- function(x) { return (sign(x)*log(abs(1.0e12*x)+1.0))}
whof <- !(colnames(avgParkison) %in% c("gender","class"));
avgParkison[,whof] <- signedlog(avgParkison[,whof])

1.2.1.1 Standarize the names for the reporting

studyName <- "Parkinsons"
dataframe <- avgParkison
outcome <- "class"

TopVariables <- 10

thro <- 0.80
cexheat = 0.15

1.3 Generaring the report

1.3.1 Libraries

Some libraries

library(psych)
library(whitening)
library("vioplot")

1.3.2 Data specs

pander::pander(c(rows=nrow(dataframe),col=ncol(dataframe)-1))
rows col
252 753
pander::pander(table(dataframe[,outcome]))
0 1
64 188

varlist <- colnames(dataframe)
varlist <- varlist[varlist != outcome]

largeSet <- length(varlist) > 1000

1.3.3 Scaling the data

Scaling and removing near zero variance columns and highly co-linear(r>0.99999) columns


  ### Some global cleaning
  sdiszero <- apply(dataframe,2,sd) > 1.0e-16
  dataframe <- dataframe[,sdiszero]

  varlist <- colnames(dataframe)[colnames(dataframe) != outcome]
  tokeep <- c(as.character(correlated_Remove(dataframe,varlist,thr=0.99999)),outcome)
  dataframe <- dataframe[,tokeep]

  varlist <- colnames(dataframe)
  varlist <- varlist[varlist != outcome]


dataframe <- FRESAScale(dataframe,method="OrderLogit")$scaledData

1.4 The heatmap of the data


if (!largeSet)
{
  
  hm <- heatMaps(data=dataframe,
                 Outcome=outcome,
                 Scale=TRUE,
                 hCluster = "row",
                 xlab="Feature",
                 ylab="Sample",
                 srtCol=45,
                 srtRow=45,
                 cexCol=cexheat,
                 cexRow=cexheat
                 )
  par(op)
}

1.4.0.1 Correlation Matrix of the Data

The heat map of the data


if (!largeSet)
{

  par(cex=0.6,cex.main=0.85,cex.axis=0.7)
  #cormat <- Rfast::cora(as.matrix(dataframe[,varlist]),large=TRUE)
  cormat <- cor(dataframe[,varlist],method="pearson")
  cormat[is.na(cormat)] <- 0
  gplots::heatmap.2(abs(cormat),
                    trace = "none",
  #                  scale = "row",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Original Correlation",
                    cexRow = cexheat,
                    cexCol = cexheat,
                     srtCol=45,
                     srtRow=45,
                    key.title=NA,
                    key.xlab="Pearson Correlation",
                    xlab="Feature", ylab="Feature")
  diag(cormat) <- 0
  print(max(abs(cormat)))
}

[1] 0.9999953

1.5 The decorrelation


DEdataframe <- IDeA(dataframe,verbose=TRUE,thr=thro)
#> 
#>  Included: 681 , Uni p: 0.01697637 , Uncorrelated Base: 183 , Outcome-Driven Size: 0 , Base Size: 183 
#> 
#> 
 1 <R=1.000,w=  1,N=  358>, Top: 85( 42 )[ 1 : 85 : 0.975 ]( 83 , 229 , 0 ),<|>Tot Used: 312 , Added: 229 , Zero Std: 0 , Max Cor: 1.000
#> 
 2 <R=1.000,w=  1,N=  358>, Top: 21( 7 )[ 1 : 21 : 0.975 ]( 21 , 45 , 83 ),<|>Tot Used: 345 , Added: 45 , Zero Std: 0 , Max Cor: 0.996
#> 
 3 <R=0.996,w=  1,N=  358>, Top: 11( 4 )[ 1 : 11 : 0.973 ]( 11 , 14 , 104 ),<|>Tot Used: 357 , Added: 14 , Zero Std: 0 , Max Cor: 0.973
#> 
 4 <R=0.973,w=  2,N=  210>, Top: 72( 2 )[ 1 : 72 : 0.937 ]( 71 , 104 , 109 ),<|>Tot Used: 436 , Added: 104 , Zero Std: 0 , Max Cor: 0.968
#> 
 5 <R=0.968,w=  2,N=  210>, Top: 13( 2 )[ 1 : 13 : 0.934 ]( 13 , 19 , 153 ),<|>Tot Used: 446 , Added: 19 , Zero Std: 0 , Max Cor: 0.974
#> 
 6 <R=0.974,w=  3,N=  160>, Top: 56( 1 )[ 1 : 56 : 0.887 ]( 55 , 82 , 159 ),<|>Tot Used: 484 , Added: 82 , Zero Std: 0 , Max Cor: 0.964
#> 
 7 <R=0.964,w=  3,N=  160>, Top: 14( 1 )[ 1 : 14 : 0.882 ]( 14 , 14 , 189 ),<|>Tot Used: 485 , Added: 14 , Zero Std: 0 , Max Cor: 0.882
#> 
 8 <R=0.882,w=  4,N=  191>, Top: 70( 3 )=( 1 )[ 2 : 70 : 0.813 ]( 68 , 94 , 198 ),<|>Tot Used: 521 , Added: 94 , Zero Std: 0 , Max Cor: 0.964
#> 
 9 <R=0.964,w=  4,N=  191>, Top: 9( 1 )[ 1 : 9 : 0.832 ]( 9 , 9 , 235 ),<|>Tot Used: 524 , Added: 9 , Zero Std: 0 , Max Cor: 0.829
#> 
 10 <R=0.829,w=  5,N=   12>, Top: 5( 2 )[ 1 : 5 : 0.800 ]( 5 , 7 , 239 ),<|>Tot Used: 525 , Added: 7 , Zero Std: 0 , Max Cor: 0.800
#> 
 11 <R=0.000,w=  5,N=   12>
#> 
 [ 11 ], 0.7995728 Decor Dimension: 525 . Cor to Base: 253 , ABase: 35 , Outcome Base: 0 
#> 
varlistc <- colnames(DEdataframe)[colnames(DEdataframe) != outcome]

pander::pander(sum(apply(dataframe[,varlist],2,var)))

718

pander::pander(sum(apply(DEdataframe[,varlistc],2,var)))

310

pander::pander(entropy(discretize(unlist(dataframe[,varlist]), 256)))

4.83

pander::pander(entropy(discretize(unlist(DEdataframe[,varlistc]), 256)))

3.49

1.5.1 The decorrelation matrix


if (!largeSet)
{

  par(cex=0.6,cex.main=0.85,cex.axis=0.7)
  
  UPSTM <- attr(DEdataframe,"UPSTM")
  
  gplots::heatmap.2(1.0*(abs(UPSTM)>0),
                    trace = "none",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Decorrelation matrix",
                    cexRow = cexheat,
                    cexCol = cexheat,
                   srtCol=45,
                   srtRow=45,
                    key.title=NA,
                    key.xlab="|Beta|>0",
                    xlab="Output Feature", ylab="Input Feature")
  
  par(op)
}

1.6 The heatmap of the decorrelated data

if (!largeSet)
{

  hm <- heatMaps(data=DEdataframe,
                 Outcome=outcome,
                 Scale=TRUE,
                 hCluster = "row",
                 cexRow = cexheat,
                 cexCol = cexheat,
                 srtCol=45,
                 srtRow=45,
                 xlab="Feature",
                 ylab="Sample")
  par(op)
}

1.7 The correlation matrix after decorrelation

if (!largeSet)
{

  cormat <- cor(DEdataframe[,varlistc],method="pearson")
  cormat[is.na(cormat)] <- 0
  
  gplots::heatmap.2(abs(cormat),
                    trace = "none",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Correlation after IDeA",
                    cexRow = cexheat,
                    cexCol = cexheat,
                     srtCol=45,
                     srtRow=45,
                    key.title=NA,
                    key.xlab="Pearson Correlation",
                    xlab="Feature", ylab="Feature")
  
  par(op)
  diag(cormat) <- 0
  print(max(abs(cormat)))
}

[1] 0.7995728

1.8 U-MAP Visualization of features

1.8.1 The UMAP based on LASSO on Raw Data

classes <- unique(dataframe[,outcome])
raincolors <- rainbow(length(classes))
names(raincolors) <- classes
datasetframe.umap = umap(scale(dataframe[,varlist]),n_components=2)
plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: Original",t='n')
text(datasetframe.umap$layout,labels=dataframe[,outcome],col=raincolors[dataframe[,outcome]+1])

1.8.2 The decorralted UMAP


datasetframe.umap = umap(scale(DEdataframe[,varlistc]),n_components=2)
plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: After IDeA",t='n')
text(datasetframe.umap$layout,labels=DEdataframe[,outcome],col=raincolors[DEdataframe[,outcome]+1])

1.9 Univariate Analysis

1.9.1 Univariate



univarRAW <- uniRankVar(varlist,
               paste(outcome,"~1"),
               outcome,
               dataframe,
               rankingTest="AUC")

100 : std_MFCC_2nd_coef 200 : app_entropy_log_3_coef 300 : app_LT_TKEO_mean_7_coef 400 : tqwt_entropy_log_dec_15 500 : tqwt_medianValue_dec_7
600 : tqwt_stdValue_dec_35 700 : tqwt_skewnessValue_dec_27




univarDe <- uniRankVar(varlistc,
               paste(outcome,"~1"),
               outcome,
               DEdataframe,
               rankingTest="AUC",
               )

100 : La_std_MFCC_2nd_coef 200 : La_app_entropy_log_3_coef 300 : La_app_LT_TKEO_mean_7_coef 400 : La_tqwt_entropy_log_dec_15 500 : tqwt_medianValue_dec_7
600 : La_tqwt_stdValue_dec_35 700 : tqwt_skewnessValue_dec_27

1.9.2 Final Table


univariate_columns <- c("caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC")

##topfive
topvar <- c(1:length(varlist)) <= TopVariables
pander::pander(univarRAW$orderframe[topvar,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP ROCAUC
std_delta_delta_log_energy 0.251 0.825 -0.736 0.804 0.483 0.798
std_delta_log_energy 0.250 0.841 -0.690 0.787 0.469 0.794
std_9th_delta_delta 0.347 0.952 -0.611 0.674 0.766 0.787
std_8th_delta_delta 0.319 0.941 -0.598 0.595 0.862 0.780
std_7th_delta_delta 0.324 0.905 -0.558 0.647 0.977 0.776
tqwt_entropy_log_dec_12 -0.147 0.876 0.764 0.876 0.676 0.770
std_6th_delta_delta 0.311 0.851 -0.470 0.540 0.896 0.768
std_8th_delta 0.310 0.950 -0.587 0.637 0.971 0.767
std_9th_delta 0.306 0.885 -0.519 0.660 0.330 0.764
tqwt_entropy_shannon_dec_12 -0.282 0.940 0.593 0.833 0.145 0.763


topLAvar <- univarDe$orderframe$Name[str_detect(univarDe$orderframe$Name,"La_")]
topLAvar <- unique(c(univarDe$orderframe$Name[topvar],topLAvar[1:as.integer(TopVariables/2)]))
finalTable <- univarDe$orderframe[topLAvar,univariate_columns]

theLaVar <- rownames(finalTable)[str_detect(rownames(finalTable),"La_")]

pander::pander(univarDe$orderframe[topLAvar,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP ROCAUC
std_delta_log_energy 0.2502 0.841 -0.690 0.787 4.69e-01 0.794
std_9th_delta 0.3061 0.885 -0.519 0.660 3.30e-01 0.764
mean_MFCC_2nd_coef -0.3598 1.433 -1.933 1.997 2.87e-06 0.753
La_tqwt_energy_dec_33 -0.0777 0.295 0.230 0.420 8.62e-01 0.749
tqwt_entropy_shannon_dec_11 -0.3083 0.958 0.504 0.884 5.34e-01 0.744
La_std_3rd_delta 0.1244 0.356 -0.216 0.488 9.37e-02 0.736
La_tqwt_entropy_shannon_dec_17 -0.1131 0.508 0.135 0.175 6.94e-01 0.734
tqwt_kurtosisValue_dec_36 0.1302 0.643 -0.408 0.545 1.62e-02 0.733
La_tqwt_kurtosisValue_dec_33 0.0505 0.377 -0.301 0.511 1.54e-01 0.732
La_apq11Shimmer 0.0392 0.289 -0.173 0.242 3.07e-01 0.732

dc <- getLatentCoefficients(DEdataframe)
fscores <- attr(DEdataframe,"fscore")

theSigDc <- dc[theLaVar]
names(theSigDc) <- NULL
theSigDc <- unique(names(unlist(theSigDc)))


theFormulas <- dc[rownames(finalTable)]
deFromula <- character(length(theFormulas))
names(deFromula) <- rownames(finalTable)

pander::pander(c(mean=mean(sapply(dc,length)),total=length(dc),fraction=length(dc)/(ncol(dataframe)-1)))
mean total fraction
2.46 466 0.626


allSigvars <- names(dc)



dx <- names(deFromula)[1]
for (dx in names(deFromula))
{
  coef <- theFormulas[[dx]]
  cname <- names(theFormulas[[dx]])
  names(cname) <- cname
  for (cf in names(coef))
  {
    if (cf != dx)
    {
      if (coef[cf]>0)
      {
        deFromula[dx] <- paste(deFromula[dx],
                               sprintf("+ %5.3f*%s",coef[cf],cname[cf]))
      }
      else
      {
        deFromula[dx] <- paste(deFromula[dx],
                               sprintf("%5.3f*%s",coef[cf],cname[cf]))
      }
    }
  }
}

finalTable <- rbind(finalTable,univarRAW$orderframe[theSigDc[!(theSigDc %in% rownames(finalTable))],univariate_columns])


orgnamez <- rownames(finalTable)
orgnamez <- str_remove_all(orgnamez,"La_")
finalTable$RAWAUC <- univarRAW$orderframe[orgnamez,"ROCAUC"]
finalTable$DecorFormula <- deFromula[rownames(finalTable)]
finalTable$fscores <- fscores[rownames(finalTable)]

Final_Columns <- c("DecorFormula","caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC","RAWAUC","fscores")

finalTable <- finalTable[order(-finalTable$ROCAUC),]
pander::pander(finalTable[,Final_Columns])
  DecorFormula caseMean caseStd controlMean controlStd controlKSP ROCAUC RAWAUC fscores
std_delta_log_energy 0.2502 0.841 -0.6902 0.787 4.69e-01 0.794 0.794 2
std_9th_delta 0.3061 0.885 -0.5192 0.660 3.30e-01 0.764 0.764 3
mean_MFCC_2nd_coef -0.3598 1.433 -1.9334 1.997 2.87e-06 0.753 0.753 NA
La_tqwt_energy_dec_33 -1.046tqwt_energy_dec_31 + 1.000tqwt_energy_dec_33 -0.0777 0.295 0.2303 0.420 8.62e-01 0.749 0.509 1
tqwt_entropy_shannon_dec_11 -0.3083 0.958 0.5035 0.884 5.34e-01 0.744 0.744 7
La_std_3rd_delta -0.971std_MFCC_3rd_coef + 1.000std_3rd_delta 0.1244 0.356 -0.2163 0.488 9.37e-02 0.736 0.645 0
La_tqwt_entropy_shannon_dec_17 + 1.000tqwt_entropy_shannon_dec_17 + 0.990tqwt_minValue_dec_17 -0.1131 0.508 0.1349 0.175 6.94e-01 0.734 0.709 -1
tqwt_kurtosisValue_dec_36 0.1302 0.643 -0.4081 0.545 1.62e-02 0.733 0.733 NA
La_tqwt_kurtosisValue_dec_33 -0.809tqwt_kurtosisValue_dec_31 + 1.000tqwt_kurtosisValue_dec_33 0.0505 0.377 -0.3010 0.511 1.54e-01 0.732 0.628 -1
La_apq11Shimmer -0.945locShimmer + 1.000apq11Shimmer 0.0392 0.289 -0.1725 0.242 3.07e-01 0.732 0.713 -1
apq11Shimmer NA 0.1867 0.824 -0.5312 0.974 5.54e-01 0.713 0.713 NA
tqwt_entropy_shannon_dec_17 NA -0.5108 1.187 0.2570 0.652 7.06e-03 0.709 0.709 NA
locShimmer NA 0.1561 0.851 -0.3796 1.006 9.39e-01 0.663 0.663 5
std_3rd_delta NA 0.3138 1.025 -0.2539 0.704 8.27e-01 0.645 0.645 NA
tqwt_minValue_dec_17 NA 0.4016 1.110 -0.1232 0.652 2.66e-02 0.636 0.636 12
tqwt_kurtosisValue_dec_33 NA 0.2156 0.824 -0.1164 0.756 5.93e-02 0.628 0.628 NA
tqwt_energy_dec_31 NA 0.0548 0.802 -0.1159 0.976 2.81e-01 0.580 0.580 3
std_MFCC_3rd_coef NA 0.1950 0.969 -0.0388 0.722 8.12e-01 0.552 0.552 1
tqwt_energy_dec_33 NA -0.0204 0.897 0.1091 1.123 2.42e-01 0.509 0.509 NA
tqwt_kurtosisValue_dec_31 NA 0.2041 0.837 0.2282 0.904 1.27e-01 0.490 0.490 3